Pima Indian dataset

We will make use of the Pima Indians diabetes data available in the MASS package, combining the training set (Pima.tr) and the test set (Pima.te) into a single data frame. We build the binary outcome and the design matrix with the following chunk of code.

Pima <- rbind(MASS::Pima.tr, MASS::Pima.te)
y <- as.numeric(Pima$type == "Yes") # Binary outcome
X <- cbind(1, model.matrix(type ~ . - 1, data = Pima)) # Design matrix
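
As a quick check, the combined data frame should contain 532 observations (200 from Pima.tr and 332 from Pima.te, according to the MASS documentation), and the design matrix should have 8 columns: the intercept plus the seven numeric predictors.

dim(X) # Expected: 532 x 8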

As before, we will employ a relatively vague prior centered at \(0\), namely \(\beta \sim N(0, 100 I_p)\). Then, we implement the log-likelihood, the log-posterior, and the gradient of the log-posterior.

# Log-likelihood of a logistic regression model
loglik <- function(beta, y, X) {
  eta <- c(X %*% beta)
  sum(y * eta - log(1 + exp(eta)))
}

# Log-posterior
logpost <- function(beta, y, X) {
  loglik(beta, y, X) + sum(dnorm(beta, 0, 10, log = TRUE))
}

# Gradient of the log-posterior
lgradient <- function(beta, y, X) {
  probs <- plogis(c(X %*% beta))
  loglik_gr <- c(crossprod(X, y - probs))
  prior_gr <- -beta / 100
  loglik_gr + prior_gr
}
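
As an optional sanity check (not part of the original workflow), the analytical gradient can be compared with a numerical one; the snippet below assumes the numDeriv package is installed.

# Hypothetical check: numerical vs. analytical gradient of the log-posterior at beta = 0
beta0 <- rep(0, ncol(X))
max(abs(lgradient(beta0, y, X) - numDeriv::grad(logpost, beta0, y = y, X = X)))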

# Summary table
summary_tab <- matrix(0, nrow = 6, ncol = 4)
colnames(summary_tab) <- c("Seconds", "Average ESS", "Average ESS per sec", "Average acceptance rate")
rownames(summary_tab) <- c("MH Laplace + Rcpp", "MALA", "MALA tuned", "HMC", "STAN", "Pólya-Gamma")

Metropolis-Hastings (Laplace) and Rcpp

library(coda)
R <- 30000
burn_in <- 5000
set.seed(123)

# Covariance matrix is selected via the Laplace approximation
fit_logit <- glm(y ~ X - 1, family = binomial(link = "logit"))
p <- ncol(X)
S <- 2.38^2 * vcov(fit_logit) / p
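
The sampler RMH_arma used below is an Rcpp implementation of the random-walk Metropolis-Hastings algorithm, presumably defined in an earlier section of this document. For reference only, a pure-R function with the same interface (a hypothetical helper, not the author's Rcpp code) might look as follows.

# Hypothetical pure-R analogue of RMH_arma (for illustration only)
RMH_R <- function(R, burn_in, y, X, S) {
  p <- ncol(X)
  out <- matrix(0, R, p) # Storage for the retained draws
  beta <- rep(0, p) # Initial values
  A <- chol(S) # Cholesky factor of the proposal covariance
  logp <- logpost(beta, y, X)
  for (r in 1:(burn_in + R)) {
    beta_new <- beta + c(crossprod(A, rnorm(p))) # Random-walk proposal: N(beta, S)
    logpnew <- logpost(beta_new, y, X)
    if (runif(1) < exp(logpnew - logp)) { # Metropolis acceptance step
      beta <- beta_new
      logp <- logpnew
    }
    if (r > burn_in) out[r - burn_in, ] <- beta
  }
  out
}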

# Running the MCMC
start.time <- Sys.time()
# MCMC
fit_MCMC <- as.mcmc(RMH_arma(R, burn_in, y, X, S)) # Convert the matrix into a "coda" object
end.time <- Sys.time()

time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1103    1154    1163    1166    1192    1204 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  24.93   25.16   25.79   25.75   26.01   27.19 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.2682  0.2682  0.2682  0.2682  0.2682  0.2682 

# Summary statistics
summary_tab[1, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)

# Traceplots of the first two coefficients
plot(fit_MCMC[, 1:2])

MALA algorithm

# R is the number of retained samples
# burn_in is the number of discarded burn-in samples
# epsilon and S are tuning parameters
MALA <- function(R, burn_in, y, X, epsilon, S) {
  p <- ncol(X)
  out <- matrix(0, R, p) # Initialize an empty matrix to store the values
  beta <- rep(0, p) # Initial values
  A <- chol(S) # Cholesky of S
  S1 <- solve(S) # Inverse of S

  lgrad <- c(S %*% lgradient(beta, y, X)) # Compute the gradient
  logp <- logpost(beta, y, X)

  sigma2 <- epsilon^2 / p^(1 / 3) # Step size, following the p^(-1/3) optimal-scaling rule
  sigma <- sqrt(sigma2)

  # Starting the MALA iterations
  for (r in 1:(burn_in + R)) {
    beta_new <- beta + sigma2 / 2 * lgrad + sigma * c(crossprod(A, rnorm(p)))

    logpnew <- logpost(beta_new, y, X)
    lgrad_new <- c(S %*% lgradient(beta_new, y, X))

    diffold <- beta - beta_new - sigma2 / 2 * lgrad_new
    diffnew <- beta_new - beta - sigma2 / 2 * lgrad

    qold <- -diffold %*% S1 %*% diffold / (2 * sigma2)
    qnew <- -diffnew %*% S1 %*% diffnew / (2 * sigma2)

    alpha <- min(1, exp(logpnew - logp + qold - qnew))
    if (runif(1) < alpha) {
      logp <- logpnew
      lgrad <- lgrad_new
      beta <- beta_new # Accept the value
    }
    # Store the values after the burn-in period
    if (r > burn_in) {
      out[r - burn_in, ] <- beta
    }
  }
  out
}
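
In formulas, the proposal implemented above is

\[
\beta^{\star} \mid \beta \sim N\!\left(\beta + \frac{\sigma^2}{2}\, S\, \nabla \log \pi(\beta \mid y),\; \sigma^2 S\right), \qquad \sigma^2 = \frac{\epsilon^2}{p^{1/3}},
\]

which is accepted with probability \(\min\left\{1, \frac{\pi(\beta^{\star} \mid y)\, q(\beta \mid \beta^{\star})}{\pi(\beta \mid y)\, q(\beta^{\star} \mid \beta)}\right\}\), where \(q\) denotes the Gaussian proposal density. Setting \(S = I_p\) gives standard MALA; the pre-conditioned version further below replaces \(S\) with the Laplace approximation of the posterior covariance.
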
set.seed(123)

epsilon <- 0.0017 # After some trial and error

# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(MALA(R = R, burn_in = burn_in, y, X, epsilon, S = diag(ncol(X)))) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.900   9.358  27.201  44.321  46.238 166.223 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  180.5   728.7  1208.5  2671.3  3283.2 10343.4 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.5638  0.5638  0.5638  0.5638  0.5638  0.5638 

# Summary statistics
summary_tab[2, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)

# Traceplots of the first two coefficients
plot(fit_MCMC[, 1:2])

MALA algorithm with pre-conditioning

library(coda)
R <- 30000
burn_in <- 5000
set.seed(123)

epsilon <- 1.68 # After some trial and error

# Covariance matrix is selected via the Laplace approximation
fit_logit <- glm(y ~ X - 1, family = binomial(link = "logit"))
S <- vcov(fit_logit)

# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(MALA(R = R, burn_in = burn_in, y, X, epsilon, S)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   8583    8762    9196    9063    9312    9409 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  3.189   3.222   3.263   3.314   3.424   3.495 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.5686  0.5686  0.5686  0.5686  0.5686  0.5686 

# Summary statistics
summary_tab[3, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)

# Traceplots of the first two coefficients
plot(fit_MCMC[, 1:2])

Hamiltonian Monte Carlo

HMC <- function(R, burn_in, y, X, epsilon, S, L = 10) {
  
  p <- ncol(X)
  out <- matrix(0, R, p) # Initialize an empty matrix to store the values
  beta <- rep(0, p) # Initial values
  logp <- logpost(beta, y, X) # Initial log-posterior
  S1 <- solve(S)
  A1 <- chol(S1)

  # Starting the HMC iterations
  for (r in 1:(burn_in + R)) {
    
    P <- c(crossprod(A1, rnorm(p))) # Momentum, drawn from N(0, S^{-1})
    logK <- c(P %*% S %*% P / 2) # Kinetic energy at the beginning of the trajectory
    
    # Make a half step for momentum at the beginning
    beta_new <- beta
    Pnew <- P + epsilon * lgradient(beta_new, y, X) / 2
    
    # Alternate full steps for position and momentum
    for (l in 1:L) {
      # Make a full step for the position
      beta_new <- beta_new + epsilon * c(S %*% Pnew)
      # Make a full step for the momentum, except at end of trajectory
      if (l != L) Pnew <- Pnew + epsilon * lgradient(beta_new, y, X)
    }
    # Make a half step for momentum at the end.
    Pnew <- Pnew + epsilon * lgradient(beta_new, y, X) / 2

    # Negate momentum at end of trajectory to make the proposal symmetric
    Pnew <- - Pnew

    # Evaluate potential and kinetic energies at the end of trajectory
    logpnew <- logpost(beta_new, y, X)
    logKnew <- c(Pnew %*% S %*% Pnew / 2) # Kinetic energy at the end of the trajectory
    
    # Accept or reject the state at end of trajectory, returning either
    # the position at the end of the trajectory or the initial position
    if (runif(1) < exp(logpnew - logp + logK - logKnew)) {
      logp <- logpnew
      beta <- beta_new # Accept the value
    }

    # Store the values after the burn-in period
    if (r > burn_in) {
      out[r - burn_in, ] <- beta
    }
  }
  out
}
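
The function above is a standard leapfrog implementation of HMC: the momentum \(P\) is drawn from \(N(0, S^{-1})\), the Hamiltonian is

\[
H(\beta, P) = -\log \pi(\beta \mid y) + \tfrac{1}{2}\, P^{\top} S P,
\]

and the state \((\beta^{\star}, P^{\star})\) obtained after \(L\) leapfrog steps of size \(\epsilon\) is accepted with probability \(\min\{1, \exp(H(\beta, P) - H(\beta^{\star}, P^{\star}))\}\).
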
set.seed(123)

epsilon <- 0.25 # After some trial and error
L <- 10

# Covariance matrix is selected via the Laplace approximation
fit_logit <- glm(y ~ X - 1, family = binomial(link = "logit"))
S <- vcov(fit_logit)

# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(HMC(R = R, burn_in = burn_in, y, X, epsilon, S, L)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 215765  222946  226610  225565  228360  233334 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1286  0.1314  0.1324  0.1331  0.1346  0.1390 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.9892  0.9892  0.9892  0.9892  0.9892  0.9892 

# Summary statistics
summary_tab[4, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)

# Traceplots of the first two coefficients
plot(fit_MCMC[, 1:2])

Hamiltonian Monte Carlo (Stan)

library(rstan)

# I am not counting the compilation time
stan_compiled <- stan_model(file = "logistic.stan") # Stan program
set.seed(1234)
# Running the MCMC
start.time <- Sys.time()
fit_HMC <- sampling(
  stan_compiled, # The stan file has been previously compiled
  data = list(X = X, y = y, n = nrow(X), p = ncol(X)), # named list of data
  chains = 1, # number of Markov chains
  warmup = burn_in, # Burn-in iterations per chain
  iter = R + burn_in # Total number of iterations per chain
)

SAMPLING FOR MODEL 'logistic' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 9.6e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:     1 / 35000 [  0%]  (Warmup)
Chain 1: Iteration:  3500 / 35000 [ 10%]  (Warmup)
Chain 1: Iteration:  5001 / 35000 [ 14%]  (Sampling)
Chain 1: Iteration:  8500 / 35000 [ 24%]  (Sampling)
Chain 1: Iteration: 12000 / 35000 [ 34%]  (Sampling)
Chain 1: Iteration: 15500 / 35000 [ 44%]  (Sampling)
Chain 1: Iteration: 19000 / 35000 [ 54%]  (Sampling)
Chain 1: Iteration: 22500 / 35000 [ 64%]  (Sampling)
Chain 1: Iteration: 26000 / 35000 [ 74%]  (Sampling)
Chain 1: Iteration: 29500 / 35000 [ 84%]  (Sampling)
Chain 1: Iteration: 33000 / 35000 [ 94%]  (Sampling)
Chain 1: Iteration: 35000 / 35000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 10.9396 seconds (Warm-up)
Chain 1:                67.1344 seconds (Sampling)
Chain 1:                78.0739 seconds (Total)
Chain 1: 
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

fit_HMC <- as.mcmc(extract(fit_HMC)$beta)

# Diagnostic
summary(effectiveSize(fit_HMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  28917   30000   30000   29865   30000   30000 
summary(R / effectiveSize(fit_HMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   1.000   1.005   1.000   1.037 
summary(1 - rejectionRate(fit_HMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       1       1       1       1       1 

# Summary statistics
summary_tab[5, ] <- c(
  time_in_sec, mean(effectiveSize(fit_HMC)),
  mean(effectiveSize(fit_HMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_HMC))
)

# Traceplots of the first two coefficients
plot(fit_HMC[, 1:2])

Pólya-gamma data augmentation

library(BayesLogit)

logit_Gibbs <- function(R, burn_in, y, X, B, b) {
  p <- ncol(X)
  n <- nrow(X)
  out <- matrix(0, R, p) # Initialize an empty matrix to store the values

  P <- solve(B) # Prior precision matrix
  Pb <- P %*% b # Term appearing in the Gibbs sampling

  Xy <- crossprod(X, y - 1 / 2)

  # Initialization
  beta <- rep(0, p)

  # Iterative procedure
  for (r in 1:(R + burn_in)) {

    # Sampling the Pólya-gamma latent variables
    eta <- c(X %*% beta)
    omega <- rpg.devroye(num = n, h = 1, z = eta)

    # Sampling beta from its Gaussian full conditional
    # Full-conditional precision: crossprod(X * sqrt(omega)) = X' Omega X, plus the prior precision P
    eig <- eigen(crossprod(X * sqrt(omega)) + P, symmetric = TRUE)

    Sigma <- crossprod(t(eig$vectors) / sqrt(eig$values)) # Full-conditional covariance
    mu <- Sigma %*% (Xy + Pb) # Full-conditional mean

    A1 <- t(eig$vectors) / sqrt(eig$values) # Satisfies t(A1) %*% A1 = Sigma
    beta <- mu + c(matrix(rnorm(p), 1, p) %*% A1) # Draw beta ~ N(mu, Sigma)

    # Store the values after the burn-in period
    if (r > burn_in) {
      out[r - burn_in, ] <- beta
    }
  }
  out
}
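
The two steps above are the Pólya-gamma full conditionals of the data-augmentation scheme of Polson, Scott and Windle (2013): with \(\Omega = \mathrm{diag}(\omega_1, \dots, \omega_n)\) and prior \(\beta \sim N(b, B)\),

\[
\omega_i \mid \beta \sim \mathrm{PG}(1, x_i^{\top}\beta), \qquad
\beta \mid \omega, y \sim N\!\left(V_{\omega}\left(X^{\top}\bigl(y - \tfrac{1}{2}\mathbf{1}_n\bigr) + B^{-1} b\right),\; V_{\omega}\right), \qquad V_{\omega} = \left(X^{\top}\Omega X + B^{-1}\right)^{-1}.
\]
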
B <- diag(100, ncol(X)) # Prior covariance matrix
b <- rep(0, ncol(X)) # Prior mean
set.seed(123)

# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(logit_Gibbs(R, burn_in, y, X, B, b)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10018   13592   15411   15182   17369   18900 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.587   1.728   1.950   2.051   2.209   2.995 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       1       1       1       1       1 

# Summary statistics
summary_tab[6, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)


# Traceplots of the first two coefficients
plot(fit_MCMC[, 1:2])

Results

                     Seconds   Average ESS  Average ESS per sec  Average acceptance rate
MH Laplace + Rcpp   0.561887    1165.75640           2074.71670                0.2682089
MALA                2.743667      44.32131             16.15404                0.5637855
MALA tuned          2.624531    9063.31983           3453.31022                0.5686190
HMC                14.735173  225565.16881          15307.94169                0.9892330
STAN               78.846092   29864.58683            378.77067                1.0000000
Pólya-Gamma        10.254378   15181.91528           1480.53009                1.0000000